## Loading required package: ggplot2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## corrplot 0.95 loaded
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
## Loading required package: lattice
## Package 'mclust' version 6.1.1
## Type 'citation("mclust")' for citing this R package in publications.
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
## 'data.frame': 3000 obs. of 17 variables:
## $ userid : int 1 2 3 4 5 6 7 8 9 10 ...
## $ date.crea : chr "9/17/2011" "1/17/2017" "5/14/2019" "11/27/2015" ...
## $ score : num 1.5 8.95 2.5 2.82 2.12 ...
## $ n.matches : int 11 56 13 32 21 14 10 9 6 31 ...
## $ n.updates.photo: int 5 2 3 5 1 2 1 1 -1 2 ...
## $ n.photos : int 6 6 4 2 4 6 4 3 4 5 ...
## $ last.connex : chr "10/7/2011" "1/31/2017" "6/17/2019" "1/15/2016" ...
## $ last.up.photo : chr "10/2/2011" "2/3/2017" "6/19/2019" "12/9/2015" ...
## $ last.pr.update : logi NA NA NA NA NA NA ...
## $ gender : int 1 1 1 0 0 1 0 0 1 1 ...
## $ sent.ana : num 6.49 4.59 6.47 5.37 5.57 ...
## $ length.prof : num 0 20.7 31.4 0 38.5 ...
## $ voyage : int 0 0 0 0 0 0 1 1 0 1 ...
## $ laugh : int 0 0 0 0 1 0 0 0 0 0 ...
## $ photo.elevator : int 0 0 0 0 0 0 0 0 0 0 ...
## $ photo.beach : int 0 1 1 1 0 0 1 0 0 0 ...
## $ Country : chr "France" "Germany" "England" "France" ...
Subsetting the data to preform MCA and PCA:
# Removing date columns that won't be used for analysis
date_cols <- c("last.up.photo", "last.pr.update", "last.connex", "date.crea","Country","userid")
tinder <- tinder %>%
select(-all_of(date_cols))
# Define the qualitative variable names
qualitative <- c( "photo.beach", "photo.elevator", "laugh", "voyage", "gender")
# Select qualitative columns from the `tinder` DataFrame
tinder_qualitative <- tinder %>%
select(all_of(qualitative))
# Select quantitative columns from the `tinder` DataFrame
tinder_quantitative <- tinder %>%
select(-all_of(qualitative))
head(tinder_quantitative)
## score n.matches n.updates.photo n.photos sent.ana length.prof
## 1 1.495834 11 5 6 6.490446 0.00000
## 2 8.946863 56 2 6 4.589125 20.72286
## 3 2.496199 13 3 4 6.473182 31.39928
## 4 2.823579 32 5 2 5.368982 0.00000
## 5 2.117433 21 1 4 5.573949 38.51022
## 6 1.700014 14 2 6 5.464667 23.11221
tinder_qualitative_df <- as.data.frame(tinder_qualitative)
tinder_quantitative_df <- as.data.frame(tinder_quantitative)
# Convert columns to factors
tinder_qualitative <- tinder_qualitative %>%
mutate(across(everything(), as.factor))
Preforming a pca on the quantitative variables of the data set:
tinder_pca<-prcomp(tinder_quantitative,scale=TRUE)
Seeing the number of principle components needed to do analysis:
# Compute variance and proportion of variance explained
tinder_var <- tinder_pca$sdev^2
tinder_pve <- tinder_var / sum(tinder_var)
# Create elbow plot
plot(tinder_pve,
type = "b",
xlab = "# of components",
ylab = "% Variance explained",
ylim = c(0, 1),
main = "Elbow Plot of PCA",
pch = 19, col = "blue")
The elbow plot doesn’t show a clear inflection point. However, the
variance explained by 2 principle components only sums to about 60%.
This suggests that we shouldn’t reduce the dimensions of the
dataset.
Preforming MCA:
# Perform MCA
head(tinder_qualitative)
## photo.beach photo.elevator laugh voyage gender
## 1 0 0 0 0 1
## 2 1 0 0 0 1
## 3 1 0 0 0 1
## 4 1 0 0 0 0
## 5 0 0 1 0 0
## 6 0 0 0 0 1
tinder_mca <- MCA(tinder_qualitative, graph = FALSE)
#projecting components on pc 1 and 2
fviz_pca_var(tinder_mca, col.var = "red",xlab='PC 1',ylab='PC 2')
Here we several vectors whose magnitude projection onto an axis represents the contribution a factor has on a specific principle component. Here we see values such Gender_1 and photo.beach_1 having significance on PC 1 while having no bearing on PC 2. Vectors with smaller magnitudes aren’t significant in the graphed PC’s, but this doesn’t mean that they’re irrelevant…
To illustrate this we can print the contribution table:
variable_contributions<-tinder_mca$var$contrib
# Subsetting the first 2 pc's and converting to a df to print neatly
contribution_table <- as.data.frame(variable_contributions[, 1:5])
print(contribution_table, row.names = TRUE)
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## photo.beach_0 5.4946973 0.003399576 0.2758844 0.03539420 5.95022575
## photo.beach_1 26.5132483 0.016403777 1.3312091 0.17078561 28.71128346
## photo.elevator_0 3.5745436 0.097037421 0.3207350 0.59363478 9.51946387
## photo.elevator_1 18.7199073 0.508185580 1.6796910 3.10886904 49.85349168
## laugh_0 0.1354581 5.810312265 6.7772077 6.74330511 0.09904889
## laugh_1 0.5568318 23.884639684 27.8592884 27.71992373 0.40716348
## voyage_0 0.3289347 8.641332018 0.5184313 11.93898139 0.57345266
## voyage_1 1.1549812 30.342120694 1.8203566 41.92108505 2.01355183
## gender_0 21.6338805 0.003883415 0.1540573 0.03549821 0.20269683
## gender_1 20.7507929 1.062594971 3.6202096 0.13053197 0.01801013
## gender_2 1.1367242 29.630090600 55.6429296 7.60199090 2.65161141
fviz_contrib(tinder_mca, choice = "var", axes = 1:5)
This table shows the percentage of information contributed to each PC by
each variable. Going off of the variable Gender_2, we can see that it
composes ~1% of PC1 but 55% of pc2’s variance.
Looking more broadly, we can see the % of variance explained by each principle component. This essentially tells us how many pc’s are needed to model the original data. In this case, if we used 5 pc’s we could model the original data with up to 86% accuracy.
inertia<-as.data.frame(tinder_mca$eig)
print(inertia)
## eigenvalue percentage of variance cumulative percentage of variance
## dim 1 0.2556924 21.30770 21.30770
## dim 2 0.2040141 17.00117 38.30887
## dim 3 0.1988266 16.56888 54.87775
## dim 4 0.1967857 16.39880 71.27655
## dim 5 0.1880749 15.67291 86.94946
## dim 6 0.1566064 13.05054 100.00000
The inertia of the eigenvalues/principle components are relatively similar to the pca.
# Assuming 'tinder_mca' contains your MCA analysis result
fviz_screeplot(tinder_mca, addlabels = TRUE, main = "Elbow Plot for MCA",ylim=c(0,30))
head((tinder_mca$ind$coord))
## Dim 1 Dim 2 Dim 3 Dim 4 Dim 5
## 1 0.2610285 -0.3482267 0.09668760 0.0506516 -0.2076804
## 2 0.9319911 -0.3649160 -0.05365775 0.1045024 0.4905410
## 3 0.9319911 -0.3649160 -0.05365775 0.1045024 0.4905410
## 4 0.3439451 -0.3021685 0.09417473 0.1159881 0.4534615
## 5 -0.4208131 0.3288199 -0.41892582 0.7239216 -0.3249656
## 6 0.2610285 -0.3482267 0.09668760 0.0506516 -0.2076804
A factor mapping on the PC’s show us the significance of each factor on a specific principle component. In real life this tells us that having a laughing photo in your profile doesn’t significantly impact the score of the profile, whereas having a photo from a trip, or beach are more likely to impact profile score.
Factor mapping of qualitative variables: Plotting the points of individuals on a factorial plane:
# Plot individuals
fviz_mca_ind(tinder_mca,
repel = TRUE, # Avoid text overlap
col.ind = "blue", # Color of individuals
geom = "point", # Show points for individuals
title = "Individuals on the Factorial Plane")
Now using our principle components we will cluster the data. In this
scenario, clustering will form groups consisting of profiles with
similar characteristics.
Looking at the graph above, the data seems to be grouped into four groups. Due to the nature of this data, it seems that k-means,hierarchical clustering would be the best method to cluster this data.
Preforming K-means:
#calculating the cluster using kmeans
set.seed(321)
kmeans_qual <- kmeans(tinder_mca$ind$coord, centers = 4)
# Convert cluster assignments to factor (discrete variable)
kmeans_clusters <- factor(kmeans_qual$cluster)
# Visualize K-means clusters on the MCA factorial plane
fviz_mca_ind(tinder_mca,
col.ind = kmeans_clusters, # Color points by clusters (discrete factor)
palette = "jco", # Choose color palette
addEllipses = TRUE, # Add ellipses for clusters
repel = TRUE, # Avoid label overlap
label = "none", # Remove data labels
arrow = FALSE, # Remove arrows
title = "K-means Clustering on MCA")
This grouping doesn’t look right, lets try hierarchical clustering…
# Perform hierarchical clustering (using Euclidean distance and complete linkage)
dist_mca <- dist(tinder_mca$ind$coord) # Compute distance matrix
hclust_qual <- hclust(dist_mca, method = "complete") # Perform hierarchical clustering
# Cut the dendrogram to create 4 clusters
hclust_clusters <- cutree(hclust_qual, k = 4) #
# Convert hierarchical cluster assignments to factor (discrete variable)
hclust_clusters_factor <- factor(hclust_clusters)
# Visualize hierarchical clustering on the MCA factorial plane
fviz_mca_ind(tinder_mca,
col.ind = hclust_clusters_factor, # Color points by clusters
palette = "jco", # Choose color palette
addEllipses = TRUE, # Add ellipses for clusters
repel = TRUE, # Avoid label overlap
label = "none", # Remove data labels
arrow = FALSE, # Remove arrows
title = "Hierarchical Clustering on MCA")
This gives a similar result to the kmeans clustering. To get a better
understanding on why the clustering is the way that it is, we can try
plotting the clusters using 3 pc’s.
# Extract the first three dimensions from the MCA coordinates
mca_3d_coords <- tinder_mca$ind$coord[, 1:3] # Assuming MCA has at least 3 dimensions
# Create a 3D scatter plot
plot <- plot_ly(
x = ~mca_3d_coords[, 1],
y = ~mca_3d_coords[, 2],
z = ~mca_3d_coords[, 3],
type = "scatter3d",
mode = "markers",
color = ~kmeans_clusters, # Color by clusters
colors = "Set2" # Color palette
)
# Add layout
plot <- plot %>% layout(
title = "3D K-means Clustering on MCA",
scene = list(
xaxis = list(title = "Dim 1"),
yaxis = list(title = "Dim 2"),
zaxis = list(title = "Dim 3")
)
)
# Print the plot
plot
In this graph we see that in 3 dimensions that the data doesn’t follow the pattern that graphing in 2d followed. *Note this graph actually contains 3000 observations. Since the data of the MCA is formed by factors, there are many points that overlap.
Now lets analyse on the PCA:
Lets what quantitative variables are correlated:
coor_plot<-ggcorr(
data=tinder_quantitative,
label=TRUE)
print(coor_plot)
The only variables that have a significant relationship are the
“n.matches” and the “score”. This is since score is in part calculated
by using n matches.
Using the MCA we can visualize the contribution of each qualitative variable on the pc’s:
fviz_pca_var(
tinder_pca,col.var = "contrib",gradient.cols=c("blue","purple","red"),rapel=TRUE
)
Here we can once again see the percentge of varience explained by the individual pc’s
inertia_pca<-as.data.frame(tinder_pca$eig)
print(inertia)
## eigenvalue percentage of variance cumulative percentage of variance
## dim 1 0.2556924 21.30770 21.30770
## dim 2 0.2040141 17.00117 38.30887
## dim 3 0.1988266 16.56888 54.87775
## dim 4 0.1967857 16.39880 71.27655
## dim 5 0.1880749 15.67291 86.94946
## dim 6 0.1566064 13.05054 100.00000
To group users, we can plot them on a plane of pc’s. In doing this we can hopefully see a pattern between user characteristics and the number of matches they have.
Plotting People on a factorial plane:
# Plot individuals on the first two dimensions (Dim 1 and Dim 2)
fviz_pca_ind(
tinder_pca,
axes = c(1, 2), # Dimensions to plot
geom.ind = "point", # Use points for individuals
col.ind = tinder$n.matches, # Color by the quality of representation (cos2)
gradient.cols = c("yellow", "orange", "red"), # Color gradient
repel = TRUE # Avoid overlapping labels
) +
ggtitle("Individuals on the Factorial Plane (Dim 1 vs Dim 2)")
Here we can see that there is a positive correlation between dim 1 and
the user score which is represented by the color.
To get more information on users we can cluster them by all principle components.
Lets see how many clusters we should use an elbow plot:
str(tinder_pca)
## List of 5
## $ sdev : num [1:6] 1.531 1.026 0.981 0.935 0.819 ...
## $ rotation: num [1:6, 1:6] 0.5989 0.6116 0.3174 0.0032 0.4068 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : chr [1:6] "score" "n.matches" "n.updates.photo" "n.photos" ...
## .. ..$ : chr [1:6] "PC1" "PC2" "PC3" "PC4" ...
## $ center : Named num [1:6] 1.95 16.78 2.07 3.52 5 ...
## ..- attr(*, "names")= chr [1:6] "score" "n.matches" "n.updates.photo" "n.photos" ...
## $ scale : Named num [1:6] 1.08 10.1 1.53 1.71 2.24 ...
## ..- attr(*, "names")= chr [1:6] "score" "n.matches" "n.updates.photo" "n.photos" ...
## $ x : num [1:3000, 1:6] 0.312 6.163 0.504 2.11 0.186 ...
## ..- attr(*, "dimnames")=List of 2
## .. ..$ : NULL
## .. ..$ : chr [1:6] "PC1" "PC2" "PC3" "PC4" ...
## - attr(*, "class")= chr "prcomp"
pca_coords <- as.data.frame(tinder_pca$x[, 1:6])
fviz_nbclust(pca_coords, kmeans, method = "wss") # Elbow method
I decided to use 6 clusters for the Gaussian Mixture and K-means model so that we get moderate variation between clusters, while maintaining a decent square error. At 6 clusters there is on average a squared error of ~3 matches.
# Load required libraries
library(FactoMineR)
library(factoextra)
# Perform K-means clustering
set.seed(42) # Set seed for reproducibility
kmeans_result <- kmeans(pca_coords, centers = 6, nstart = 250) # Adjust 'centers' as needed
## Warning: did not converge in 10 iterations
# View clustering results
print(kmeans_result)
## K-means clustering with 6 clusters of sizes 400, 613, 450, 599, 523, 415
##
## Cluster means:
## PC1 PC2 PC3 PC4 PC5 PC6
## 1 2.7450964 -0.2532806 -0.04801216 0.33038799 0.27276275 -0.01576899
## 2 -1.0223461 -0.3415785 -0.90078490 0.44907999 0.08422322 -0.01472560
## 3 0.4817928 -0.1383272 -0.84011420 -0.94089828 -0.28145885 0.01510575
## 4 -1.3370468 0.7940561 0.54462685 -0.07391694 0.14623527 -0.02822632
## 5 0.6174048 0.9186106 0.53412828 0.14272677 -0.08436620 0.02872047
## 6 -0.5064069 -1.4051224 0.82857065 -0.03471531 -0.18686471 0.02511702
##
## Clustering vector:
## [1] 3 1 5 3 5 6 2 4 4 1 5 4 2 3 1 3 6 3 2 2 6 4 2 3 4 1 2 3 2 4 2 4 5 3 4 6 4
## [38] 4 3 3 2 3 6 4 6 1 3 4 2 4 6 6 6 2 1 1 1 1 2 4 5 4 2 3 2 2 3 3 2 3 2 6 6 3
## [75] 3 4 3 1 4 3 6 4 2 6 5 5 6 6 1 4 4 5 4 3 4 4 6 6 3 5 2 3 2 3 6 6 4 2 3 5 2
## [112] 5 6 6 5 4 6 4 6 3 6 3 3 2 5 1 2 3 5 1 5 3 2 4 5 6 3 5 1 5 2 6 5 4 2 1 1 1
## [149] 5 6 3 2 6 1 2 4 5 1 2 1 5 2 5 6 1 2 1 1 2 2 2 5 3 1 3 2 3 2 5 1 3 4 1 3 4
## [186] 4 6 2 2 6 4 2 1 2 6 2 6 6 3 6 4 5 2 5 6 2 3 1 2 1 4 6 1 4 4 5 4 2 3 6 6 6
## [223] 1 3 1 1 3 3 2 1 4 4 2 6 2 4 4 1 4 6 4 5 3 2 2 3 4 1 1 3 4 3 2 6 6 2 3 4 1
## [260] 6 3 1 3 5 3 1 3 5 2 6 6 2 1 4 4 2 5 4 4 6 6 5 2 6 1 3 4 1 2 5 4 2 5 6 5 2
## [297] 3 3 6 2 5 1 1 5 5 5 1 2 1 6 2 2 2 2 2 3 2 5 4 3 4 6 4 3 1 5 4 2 2 3 4 1 5
## [334] 4 2 2 4 1 4 3 5 1 1 4 2 4 6 1 5 5 2 3 4 2 5 2 4 5 2 1 2 2 1 5 5 2 3 5 4 4
## [371] 5 2 3 2 4 3 2 5 5 5 5 6 5 5 4 1 4 5 6 5 6 4 1 5 4 1 5 2 4 1 6 2 6 4 6 6 2
## [408] 5 1 6 4 2 2 4 5 6 2 4 4 5 2 3 5 4 1 5 5 4 3 5 1 4 4 5 5 2 3 1 6 2 5 3 6 3
## [445] 2 1 6 3 5 6 6 1 1 3 3 1 5 6 1 4 6 5 5 2 6 5 2 2 4 1 2 4 5 6 4 2 4 4 2 1 5
## [482] 4 3 4 5 3 4 2 3 4 6 3 5 3 6 1 5 2 5 2 2 3 2 6 3 3 3 2 3 6 6 6 5 3 2 2 2 4
## [519] 5 4 2 5 6 2 4 4 4 1 4 5 3 5 4 2 3 1 1 2 6 2 4 2 3 5 6 4 3 2 3 6 5 3 3 2 4
## [556] 6 6 5 4 2 4 5 4 5 2 2 1 2 6 5 3 4 1 4 4 4 1 1 4 2 3 6 4 2 2 4 1 2 2 1 1 2
## [593] 2 5 1 2 4 3 2 4 3 2 3 4 5 4 4 6 4 3 1 2 2 3 2 2 5 1 1 6 5 4 6 3 3 3 2 4 1
## [630] 2 6 5 3 4 2 1 2 5 3 6 6 4 1 4 3 4 3 4 6 3 4 5 6 4 4 2 4 6 2 5 6 1 6 5 6 6
## [667] 4 5 4 5 3 2 3 2 1 4 1 5 4 1 1 2 4 4 5 4 2 1 1 3 2 4 3 4 3 3 4 6 2 1 4 3 2
## [704] 4 6 4 4 2 6 2 2 4 6 4 4 5 6 2 1 3 2 6 3 4 5 5 5 3 5 2 1 5 1 6 4 4 1 1 1 4
## [741] 5 2 5 4 4 6 2 2 4 4 5 4 4 5 3 2 4 5 2 1 3 6 3 2 3 3 5 2 5 2 3 4 5 6 5 3 1
## [778] 3 6 4 1 3 5 1 2 5 1 6 5 2 2 6 5 1 4 5 6 4 5 2 4 2 5 4 2 6 1 2 2 6 5 3 1 4
## [815] 6 1 1 6 1 4 5 5 4 4 4 2 4 3 2 6 5 1 6 1 2 5 1 5 2 5 2 4 5 1 1 5 3 4 2 2 2
## [852] 5 1 4 6 5 1 4 4 3 2 4 2 5 1 5 4 1 4 2 2 2 4 1 6 6 4 2 2 2 2 4 2 5 2 4 6 5
## [889] 3 6 1 1 6 2 2 6 4 3 6 3 6 2 1 4 3 1 5 1 2 1 5 2 5 4 4 1 3 4 2 4 3 4 4 5 4
## [926] 6 5 2 3 2 5 2 6 5 4 5 1 5 5 3 4 5 1 5 4 4 3 1 3 5 3 1 6 3 5 5 2 4 1 2 2 6
## [963] 4 4 3 6 5 3 5 4 1 5 4 4 4 2 4 1 6 2 5 5 1 2 3 4 4 5 5 2 2 2 5 5 6 5 4 5 2
## [1000] 2 1 5 6 5 3 6 6 4 4 2 2 3 4 6 6 6 5 1 1 3 5 4 2 5 2 2 1 6 4 6 6 4 4 4 2 1
## [1037] 6 6 4 4 3 5 5 5 2 3 2 1 1 6 4 3 5 6 4 1 1 5 5 4 6 4 5 3 2 1 2 6 3 6 2 2 2
## [1074] 6 1 2 2 2 3 2 4 3 5 1 5 2 1 4 5 4 2 6 5 2 5 2 5 3 3 5 4 5 1 1 5 6 6 6 2 1
## [1111] 1 2 2 2 1 2 6 1 5 1 6 2 4 6 4 5 3 3 6 2 1 2 5 4 6 2 5 3 6 4 4 4 3 1 2 2 3
## [1148] 1 5 4 3 5 4 1 4 5 4 3 3 6 5 6 6 5 6 4 4 4 3 4 2 4 5 5 5 1 1 2 6 3 4 6 2 1
## [1185] 4 6 6 2 5 6 5 4 2 4 2 3 1 6 3 2 4 5 1 4 1 5 5 2 3 2 2 1 3 6 5 4 3 2 2 1 6
## [1222] 2 4 2 1 5 3 6 2 4 2 2 4 4 6 4 4 3 2 4 5 5 4 2 4 1 5 2 2 4 3 3 6 2 6 2 5 3
## [1259] 4 5 2 6 5 4 5 5 3 5 3 1 6 4 3 5 6 1 5 4 3 1 6 6 4 5 4 6 5 5 6 3 4 5 2 1 6
## [1296] 6 5 1 4 4 3 5 3 4 2 4 3 2 4 2 5 4 5 3 5 3 2 3 6 6 4 4 2 2 5 5 2 5 3 1 4 4
## [1333] 2 4 1 3 3 2 6 4 2 1 4 5 2 5 5 4 2 1 4 5 6 4 5 4 2 2 4 4 1 2 5 6 4 1 1 5 4
## [1370] 6 6 1 3 5 3 6 2 2 3 1 4 3 3 5 6 2 4 4 2 3 4 4 6 5 2 2 3 5 6 3 2 1 2 6 6 2
## [1407] 2 2 6 5 2 5 2 4 4 5 6 4 2 2 5 5 5 6 6 3 1 4 5 5 5 1 4 5 2 5 1 6 6 1 6 6 3
## [1444] 2 6 3 3 5 4 4 2 5 5 4 5 2 3 5 3 2 1 5 3 6 3 5 4 2 4 5 4 5 2 2 2 1 5 6 2 2
## [1481] 1 3 2 6 1 2 6 3 6 4 5 4 2 2 3 6 3 2 3 1 1 2 2 1 3 2 2 2 5 3 5 1 2 3 5 2 5
## [1518] 2 4 5 6 3 1 2 2 3 1 5 6 1 1 3 3 4 3 3 3 6 2 3 2 3 4 5 4 5 2 1 2 1 4 1 4 6
## [1555] 4 3 2 4 6 5 6 4 3 4 2 5 2 4 4 2 2 5 3 3 5 4 1 3 4 4 5 2 1 2 2 4 6 1 6 1 6
## [1592] 1 2 2 4 4 4 1 2 2 4 5 4 4 1 5 6 6 5 4 5 4 4 3 4 4 1 4 2 1 2 5 5 1 3 6 4 4
## [1629] 5 3 1 5 5 2 5 2 2 6 4 3 5 5 6 4 3 1 3 1 3 4 4 4 3 2 4 1 3 5 5 4 3 6 4 6 3
## [1666] 3 4 4 1 3 3 2 3 2 3 3 3 2 1 5 1 4 5 1 5 1 2 4 6 1 1 2 4 2 5 2 5 2 6 1 3 2
## [1703] 1 4 3 4 2 3 4 1 3 4 5 6 6 4 3 6 4 5 2 1 5 6 3 5 5 6 1 2 6 1 6 4 6 2 4 4 1
## [1740] 1 5 6 2 4 5 1 5 1 1 6 4 1 1 4 5 4 5 6 4 5 5 4 5 6 4 4 5 4 1 1 5 2 4 4 6 5
## [1777] 2 4 6 4 1 2 2 1 1 3 3 2 4 4 5 2 4 1 1 1 3 2 5 4 6 2 4 3 5 4 4 5 1 5 5 5 5
## [1814] 2 2 5 2 4 3 5 3 2 4 5 4 2 2 4 4 3 4 5 6 2 3 4 4 2 2 4 3 5 4 3 4 2 2 2 5 1
## [1851] 6 3 5 3 6 3 6 2 5 2 3 2 3 2 3 5 5 6 2 1 6 4 3 3 4 5 4 1 4 6 5 2 6 5 6 2 4
## [1888] 2 3 1 1 3 1 4 1 2 6 4 1 6 3 2 3 2 6 2 4 4 2 6 4 4 4 4 5 2 2 5 3 1 5 6 3 5
## [1925] 2 5 6 6 5 2 6 3 6 1 2 4 1 4 2 4 3 6 2 2 3 5 5 2 2 5 5 4 1 5 6 3 2 2 2 5 3
## [1962] 6 3 2 2 3 2 6 4 6 5 3 6 6 5 2 5 1 1 2 4 1 6 2 6 4 1 2 3 5 3 6 4 3 4 3 2 2
## [1999] 3 4 3 3 5 2 4 4 2 4 4 2 6 1 6 1 5 6 2 1 5 3 2 4 5 3 2 1 3 4 3 6 5 5 4 3 6
## [2036] 2 1 1 5 6 6 3 3 5 3 2 4 6 3 6 1 1 1 3 6 5 3 4 3 4 5 6 3 2 3 6 2 5 3 3 4 4
## [2073] 5 4 5 3 2 6 4 4 6 1 6 3 5 5 2 2 4 2 3 4 6 3 3 2 1 4 4 1 2 1 4 1 4 2 6 6 3
## [2110] 4 5 1 1 6 3 2 5 3 3 2 6 2 1 5 1 5 4 2 6 2 2 2 1 4 1 3 5 6 2 5 2 2 2 6 4 2
## [2147] 4 6 5 1 6 4 5 3 2 2 1 2 5 3 3 5 1 5 4 2 2 6 6 2 5 4 5 6 2 4 4 2 3 6 2 6 2
## [2184] 4 2 6 4 5 5 4 6 1 3 3 2 4 4 6 4 6 2 2 2 2 4 6 1 4 1 6 5 4 3 5 3 6 5 3 4 5
## [2221] 6 4 4 2 3 4 4 4 5 2 4 6 6 6 6 4 4 2 3 6 3 3 2 2 3 4 6 3 1 6 4 2 4 1 6 3 4
## [2258] 4 3 1 6 5 6 4 3 3 1 6 3 1 6 5 5 3 1 1 3 2 4 3 3 1 2 2 1 6 1 6 2 3 3 1 1 2
## [2295] 4 3 4 5 2 3 5 2 6 1 4 2 3 5 4 2 2 2 3 4 5 4 1 1 5 1 5 6 4 5 6 6 5 5 4 2 2
## [2332] 1 2 6 2 6 4 3 2 4 3 6 4 2 4 6 5 3 5 4 4 4 3 5 2 1 3 5 6 6 4 5 1 2 3 3 6 2
## [2369] 2 1 3 1 3 3 5 3 4 5 5 1 5 5 4 5 1 2 3 6 3 1 3 4 4 2 2 1 5 5 2 3 2 6 1 2 5
## [2406] 5 1 4 3 1 4 2 1 5 3 2 1 2 2 5 4 4 4 1 5 2 5 3 2 4 4 2 1 4 6 3 5 4 2 3 2 1
## [2443] 2 6 6 5 5 6 3 2 4 4 4 2 2 4 4 1 5 1 4 6 1 3 2 3 2 2 5 5 1 2 5 1 6 4 2 1 4
## [2480] 6 3 4 6 5 5 2 4 6 1 1 4 3 5 2 2 5 5 5 2 1 6 2 3 6 4 5 1 3 4 2 3 5 4 4 2 6
## [2517] 5 4 3 3 4 4 1 3 4 5 1 3 3 2 2 1 6 6 6 6 5 2 2 4 5 3 5 4 2 6 2 6 4 1 4 3 3
## [2554] 3 3 3 3 3 2 6 4 2 5 2 4 6 2 1 4 2 5 1 4 1 4 6 5 5 4 6 2 1 2 4 5 6 3 3 1 4
## [2591] 3 5 3 3 2 3 2 4 6 2 3 5 4 4 3 4 4 4 4 6 2 5 5 4 2 5 3 2 2 1 1 2 5 5 6 2 2
## [2628] 4 2 4 3 4 4 6 5 4 2 6 4 2 1 2 6 5 4 4 4 5 2 2 3 6 1 1 2 2 1 2 3 2 2 3 5 5
## [2665] 5 1 5 3 5 1 5 2 3 4 6 4 2 2 4 5 2 2 5 2 1 2 2 1 5 4 6 4 6 2 3 4 6 3 6 3 2
## [2702] 4 1 4 1 6 6 1 2 4 3 6 3 1 2 2 3 5 5 1 3 1 1 5 5 5 5 1 3 1 5 5 6 1 1 3 6 3
## [2739] 2 2 3 3 1 4 5 5 5 1 2 1 3 5 6 1 2 2 1 3 1 5 5 1 3 1 5 3 4 5 1 1 4 4 2 4 3
## [2776] 1 6 3 4 1 5 6 2 6 3 2 4 4 6 3 2 4 5 1 6 6 1 3 5 2 6 2 2 3 1 2 1 6 5 4 2 1
## [2813] 5 2 6 2 6 3 2 4 4 2 6 4 1 4 5 2 3 4 4 4 3 2 5 1 5 4 2 3 6 5 6 5 6 4 3 3 2
## [2850] 5 6 5 1 4 3 3 5 5 4 2 3 1 5 5 1 5 4 4 2 5 6 2 5 6 4 4 1 5 3 2 2 4 2 5 4 6
## [2887] 5 4 5 2 3 1 4 6 4 2 5 4 4 2 2 5 2 6 4 3 6 6 6 5 3 3 2 3 6 6 5 3 3 2 2 1 6
## [2924] 5 5 6 6 2 4 4 6 5 4 4 1 3 1 3 5 2 4 2 5 4 5 4 3 2 5 2 5 4 5 2 1 5 4 6 1 6
## [2961] 4 4 1 2 4 3 4 2 2 1 5 2 4 1 5 4 3 5 6 5 2 1 5 5 2 4 3 4 2 2 5 3 1 3 2 3 5
## [2998] 1 1 5
##
## Within cluster sum of squares by cluster:
## [1] 1877.886 1363.571 1453.163 1463.862 1423.827 1421.426
## (between_SS / total_SS = 50.0 %)
##
## Available components:
##
## [1] "cluster" "centers" "totss" "withinss" "tot.withinss"
## [6] "betweenss" "size" "iter" "ifault"
# Add clusters to the PCA result for visualization
fviz_pca_ind(
tinder_pca,
geom.ind = "point", # Use points to represent individuals
col.ind = as.factor(kmeans_result$cluster), # Color by clusters
palette = "Set2", # Use a color palette
addEllipses = TRUE, # Add confidence ellipses for clusters
legend.title = "Clusters" # Title for the legend
) +
ggtitle("K-Means Clustering on PCA Coordinates")
gmm_result <- Mclust(pca_coords, G = 4) # assigning 4 clusters
pca_coords$Cluster <- as.factor(gmm_result$classification) # For GMM
Comparing to another clustering method…
ggplot(pca_coords, aes(x = PC1, y = PC2, color = Cluster)) +
geom_point(size = 2, alpha = 0.6) +
stat_ellipse(aes(fill = Cluster), alpha = 0.2, geom = "polygon") +
scale_color_brewer(palette = "Set1") + # Color scheme
scale_fill_brewer(palette = "Set1") + # Matching fill colors for ellipses
labs(
title = "Gaussian Mixture Model Clusters with Ellipses",
x = "Principal Component 1 (PC1)",
y = "Principal Component 2 (PC2)",
color = "Cluster",
fill = "Cluster"
) +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5),
legend.position = "right"
)
Now we can plot the average measurements for each cluster…
cluster_summary <- tinder_quantitative %>%
group_by(tinder_quantitative$Cluster) %>%
summarise(across(where(is.numeric), mean, na.rm = TRUE))%>%
arrange(desc(n.matches))
## Warning: There was 1 warning in `summarise()`.
## ℹ In argument: `across(where(is.numeric), mean, na.rm = TRUE)`.
## Caused by warning:
## ! The `...` argument of `across()` is deprecated as of dplyr 1.1.0.
## Supply arguments directly to `.fns` through an anonymous function instead.
##
## # Previously
## across(a:b, mean, na.rm = TRUE)
##
## # Now
## across(a:b, \(x) mean(x, na.rm = TRUE))
print(cluster_summary)
## # A tibble: 1 × 6
## score n.matches n.updates.photo n.photos sent.ana length.prof
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1.95 16.8 2.07 3.52 5.00 16.1
These clusters reinforce that the only quantitative factors (from this dataset) that can be used to linearly predict the # of matches of a user is sent.ana, and n.updates.photo. However, it seems that there may be a non linear relationship between n.photos and score.